# Keep things nice and tidy, all libraries go here
library(readxl)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(svglite)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggsci)
# Read the review spreadsheet, skipping its first row before the header is read.
data <- read_excel(path = "data_IEEE.xlsx", skip = 1)
## New names:
## * `` -> ...35
# Keep only the rows whose `Exclude` cell is empty.
data <- filter(data, is.na(Exclude))
# Years without any publication (for easy slicing)
years_no_publications <- c("1974", "1975", "1976", "1978")

# LABELS so slicing will not become a mess
# Spreadsheet column names of the SWEBoK areas.
swebok_areas_labels <- c(
  "SR", "SD", "SC", "ST", "SM", "SCM", "SEM", "SEP", "SEMM", "SQ", "SEPP",
  "SEE", "CF", "MF", "EF"
)
# Same list without the CF/MF/EF columns; derived so the two cannot drift apart.
swebok_areas_labels_no_foundation <- setdiff(
  swebok_areas_labels,
  c("CF", "MF", "EF")
)
# Display names, in the same order as swebok_areas_labels_no_foundation.
swebok_areas_labels_long <- c(
  "Requirements", "Design", "Construction", "Testing", "Maintenance",
  "Config. Mgmt.", "SE Mgmt.", "SE Processes", "SE Models&Methods",
  "Software Quality", "SE Prof. Practice", "SE Economics"
)

# Spreadsheet column names of the cognitive taxonomy.
cognitive_concepts_labels <- c(
  "Attention", "Selective attention", "Divided attention",
  "Sustained attention", "Memory", "Working memory", "Short-term memory",
  "Long-term memory", "Cognitive load", "Intrinsic CL", "Extrinsic CL",
  "Perception", "Problem solving", "Reasoning", "Decision making",
  "Cognitive biases", "Knowledge", "Explicit knowledge", "Tacit knowledge",
  "Techn. tacit knowl.", "Cogn. tacit knowl.", "Cognitive control",
  "Social Cognition"
)

# Spreadsheet column names of the assessment procedures.
measures_labels <- c(
  "Qualit. measures", "Fieldwork", "Interview", "Task-based",
  "Open observation", "Quantit. measures", "Task performance",
  "Physiological meas.", "Subjective ratings", "Behavioral meas."
)

# COLORS
tol9qualitative <- c(
  "#332288", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77",
  "#CC6677", "#882255", "#AA4499"
)
NPG_modified <- c(
  "#F5E144", "#4DBBD5FF", "#00A087FF", "#3C5488FF", "#F39B7FFF",
  "#8491B4FF", "#91D1C2FF", "#DC0000FF", "#7E6148FF"
)

# Necessary for grouping by high-level category
#
# Adds a `Concept` column that maps each fine-grained `Taxonomy` label to its
# high-level concept. Rows whose `Taxonomy` matches none of the labels listed
# below get NA, because the case_when() has no default branch.
#
# data: a data frame with a `Taxonomy` column.
# Returns: `data` with the additional `Concept` column.
add_high_level_concepts_to_data <- function(data) {
  data %>%
    mutate(
      Concept = case_when(
        Taxonomy %in% c(
          "Attention", "Selective attention", "Divided attention",
          "Sustained attention"
        ) ~ "Attention",
        Taxonomy %in% c(
          "Memory", "Working memory", "Short-term memory", "Long-term memory"
        ) ~ "Memory",
        Taxonomy %in% c(
          "Cognitive load", "Extrinsic CL", "Intrinsic CL"
        ) ~ "Cognitive load",
        Taxonomy == "Perception" ~ "Perception",
        Taxonomy %in% c(
          "Problem solving", "Reasoning", "Decision making"
        ) ~ "Reasoning",
        Taxonomy %in% c("Cognitive biases") ~ "Cognitive biases",
        Taxonomy %in% c(
          "Knowledge", "Explicit knowledge", "Tacit knowledge",
          "Techn. tacit knowl.", "Cogn. tacit knowl."
        ) ~ "Knowledge",
        Taxonomy %in% c("Cognitive control") ~ "Cognitive control",
        # The column is spelled "Social Cognition"; the concept uses lower case.
        Taxonomy == "Social Cognition" ~ "Social cognition"
      )
    )
}

Visualizing number of publications over time

# Bar chart of the number of publications per year; the count is printed in
# white on each bar.
ggplot(data, aes(x = as.factor(Year))) +
  geom_bar() +
  ylab("Number of publications") +
  xlab("Year") +
  # after_stat(count) is the supported spelling of the older ..count.. notation
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 2, color = "white", size = 2.5
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Saves the plot displayed last; with no width/height given, ggsave() uses the
# size of the current graphics device.
ggsave("yearly_distribution.eps")
## Saving 7 x 5 in image
# Turn "?" entries of Academia/Industry into NA, then derive `Type` from the
# two columns. Anything not matched by the first three rules ends up as "Both".
data <- data %>%
  mutate(
    Academia = replace(Academia, Academia == "?", NA),
    Industry = replace(Industry, Industry == "?", NA),
    Type = case_when(
      is.na(Academia) & is.na(Industry) ~ "None",
      Academia == "1" & is.na(Industry) ~ "Academia",
      Industry == "1" & is.na(Academia) ~ "Industry",
      TRUE ~ "Both"
    )
  )

Number of publications according to their type

# Bar chart of publications per `Type`, most frequent type first, with the
# count printed in white on each bar.
data %>%
  mutate(Type = fct_infreq(Type, ordered = TRUE)) %>%
  ggplot(aes(x = Type)) +
  geom_bar(width = .5) +
  xlab("Type of publication") +
  ylab("Number of publications") +
  # after_stat(count) is the supported spelling of the older ..count.. notation
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 3, color = "white", size = 4
  ) +
  theme_bw()

# Saves the plot displayed last, at the size of the current graphics device.
ggsave("academia_industry_distribution.pdf")
## Saving 7 x 5 in image

Number of publications categorized according to SWEBoK Areas.

A publication can be in more than one category at the same time.

# Total number of publications per SWEBoK area, largest first.
data %>%
  select(all_of(swebok_areas_labels)) %>% # selecting columns corresponding to the SWEBoK Areas
  mutate_all(replace_na, 0) %>%
  summarise_all(sum) %>%
  # reshape every selected column instead of relying on the positions 1:15
  gather(key = "SWEBOKArea", value = "publications", everything()) %>%
  arrange(-publications) %>%
  # freeze the sorted order as factor levels so the bars keep it
  mutate(SWEBOKArea = factor(SWEBOKArea, SWEBOKArea)) %>%
  ggplot(aes(x = SWEBOKArea, y = publications)) +
  geom_col() +
  geom_text(aes(label = publications), vjust = -0.3, color = "black", size = 4) +
  xlab("SWEBoK Area") +
  ylab("Number of publications") +
  theme_bw()

# Saves the plot displayed last, at the size of the current graphics device.
ggsave("swebok_distribution.pdf")
## Saving 7 x 5 in image

Co-occurrences of SWEBoK Areas

# Area-by-area matrix computed as crossprod(X) = t(X) %*% X over the SWEBoK
# columns (NA treated as 0). Assuming the cells are 0/1 flags, the diagonal is
# the number of publications per area and each off-diagonal cell the number of
# publications tagged with both areas.
swebokareas <- data %>%
  select(all_of(swebok_areas_labels)) %>%
  mutate_all(replace_na, 0) %>%
  as.matrix() %>%
  crossprod()

kable(swebokareas)
SR SD SC ST SM SCM SEM SEP SEMM SQ SEPP SEE CF MF EF
SR 49 18 5 2 4 0 7 2 4 0 7 0 0 0 1
SD 18 66 17 3 4 0 6 2 6 1 6 0 0 0 1
SC 5 17 77 5 22 1 3 2 2 0 3 0 0 0 0
ST 2 3 5 12 4 0 1 0 0 0 0 0 0 0 0
SM 4 4 22 4 46 1 2 1 0 0 1 0 0 0 0
SCM 0 0 1 0 1 2 0 1 0 0 0 0 0 0 0
SEM 7 6 3 1 2 0 26 3 1 0 7 3 0 0 1
SEP 2 2 2 0 1 1 3 10 0 0 2 1 0 0 0
SEMM 4 6 2 0 0 0 1 0 8 0 1 0 0 0 0
SQ 0 1 0 0 0 0 0 0 0 6 0 0 0 0 0
SEPP 7 6 3 0 1 0 7 2 1 0 18 3 0 0 1
SEE 0 0 0 0 0 0 3 1 0 0 3 5 0 0 0
CF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
MF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
EF 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1
# Interactive heatmap of the `swebokareas` matrix, labelled with the area codes.
plot_ly(
  x = swebok_areas_labels,
  y = swebok_areas_labels,
  z = swebokareas,
  type = "heatmap"
)
# For every SWEBoK area, sum the cognitive-taxonomy columns over the
# publications tagged with that area (long format: SWEBOK, Taxonomy, count).
x <- data %>%
  select(all_of(swebok_areas_labels), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # NOTE(review): the recorded run warns "NAs introduced by coercion" here, so
  # at least one `Problem solving` cell holds non-numeric text; that NA makes
  # the affected sum NA and the point is later dropped from the plot.
  mutate(`Problem solving` = as.numeric(`Problem solving`)) %>%
  gather(key = "SWEBOK", value = "pubs", all_of(swebok_areas_labels)) %>% # use SWEBOK area as factor
  filter(pubs > 0) %>% # select areas for which there are publications
  group_by(SWEBOK) %>%
  summarise_all(sum) %>% # number of publication for each area
  select(-pubs) %>% # remove pubs to reuse it later
  gather(key = "Taxonomy", value = "count", all_of(cognitive_concepts_labels)) %>% # count publications in each cognitive taxonomy area
  mutate(label = str_replace(as.character(count), "^0", "")) # blank label for zero counts
## Warning: NAs introduced by coercion
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(swebok_areas_labels)` instead of `swebok_areas_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cognitive_concepts_labels)` instead of `cognitive_concepts_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Bubble plot
# Sort by Taxonomy and freeze that order as the factor levels.
x <- arrange(x, Taxonomy)
xf <- x$Taxonomy
xfu <- unique(xf)
x$Taxonomy <- factor(xf, levels = xfu)

p <- ggplot(x)
p +
  geom_point(
    aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), size = count),
    shape = 21, fill = "white", alpha = 0.60
  ) +
  geom_text(
    aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), label = label),
    size = 2
  ) +
  labs(x = "SWEBOK Area", y = "Taxonomy Area") +
  # theme_bw() is a complete theme: it has to come BEFORE theme(), otherwise it
  # replaces every customisation below.
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1.1, size = 9, colour = "black"),
    axis.text.y = element_text(size = 8, colour = "black"),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10, colour = "black", vjust = 0.12),
    panel.grid.major = element_line(linetype = "dashed", size = 0.1, color = "black")
  )
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).

# Saves the plot displayed last, at the size of the current graphics device.
ggsave(filename = "swebok_taxonomy_bubble.pdf")
## Saving 7 x 5 in image
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text).
# Preparing the dataset for analysing the research methods
# Quantitative / Qualitative are 1 when any of the respective measure columns
# equals 1 and 0 otherwise (including when every column is NA); Both is 1 when
# the two flags are both 1.
data <- data %>%
  mutate(
    Quantitative = replace_na(
      case_when(
        `Quantit. measures` == 1 | `Task performance` == 1 |
          `Physiological meas.` == 1 | `Subjective ratings` == 1 |
          `Behavioral meas.` == 1 ~ 1
      ),
      0
    ),
    Qualitative = replace_na(
      case_when(
        Fieldwork == 1 | Interview == 1 | `Qualit. measures` == 1 |
          `Task-based` == 1 | `Open observation` == 1 ~ 1
      ),
      0
    ),
    Both = if_else(Qualitative == 1 & Quantitative == 1, 1, 0)
  )

The graphs below are prepared for IEEE Software Submission

Number of publications per year according to SWEBOK areas

# Stacked bar chart: publications per year, split by SWEBOK area.
data %>%
  filter(is.na(Exclude)) %>%
  # pick the area columns by name rather than by the positions SR:EF / 2:16
  select(Year, all_of(swebok_areas_labels)) %>%
  gather("SWEBOK", "publications", -Year) %>%
  mutate_all(replace_na, 0) %>%
  group_by(Year, SWEBOK) %>%
  summarise(total = sum(publications)) %>%
  ungroup() %>%
  ggplot(aes(x = as.factor(Year), fill = SWEBOK, y = total)) +
  # the totals are already computed, so draw them as-is; this also removes the
  # size legend that stat = "sum" used to add
  geom_col() +
  xlab("Year") +
  ylab("Publications") +
  scale_fill_discrete(name = "SWEBOK Areas") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

# Saves the plot displayed last, at the size of the current graphics device.
ggsave("years_swebok.pdf")
## Saving 7 x 5 in image

Evolution of research methods over the years

# Add a placeholder row (all other columns NA) for every year from 1973 to
# 2016 that has no publication, then label each row with its research method.
# Placeholder rows get an NA label because `Both` is NA there.
# NOTE(review): a row with Qualitative == 0 and Quantitative == 0 falls through
# to "Quantitative" — confirm no such rows exist or that this is intended.
data <- data %>%
  complete(Year = seq(1973, 2016)) %>%
  mutate(
    research_method = if_else(
      Both == 1,
      "Mixed",
      if_else(Qualitative == 1, "Qualitative", "Quantitative")
    )
  )

# Publications per year, stacked by research method; NA rows are drawn
# transparent so empty years still occupy a slot on the x axis.
data %>%
  ggplot(aes(x = as.factor(Year), fill = research_method)) +
  geom_bar() +
  scale_fill_discrete(
    name = "Research method",
    labels = c("Mixed", "Qualitative", "Quantitative", ""),
    na.value = "transparent"
  ) +
  xlab("Year") +
  ylab("Publications") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

# Saves the plot displayed last, at the size of the current graphics device.
ggsave(filename = "years_researchmethods.pdf")
## Saving 7 x 5 in image

Prevalence of research methods in the SWEBOK areas

# Publications per SWEBOK area and research method (long format).
# mutate_all(replace_na, 0) also turns an NA research_method into "0", so rows
# without a method form their own group.
data.swebok.researchmethod <- data %>%
  select(all_of(swebok_areas_labels), research_method) %>%
  mutate_all(replace_na, 0) %>%
  group_by(research_method) %>%
  summarise_at(swebok_areas_labels, sum) %>%
  gather("SWEBOK", "Publications", all_of(swebok_areas_labels))

# Horizontal stacked bars, areas ordered by their total number of publications.
data.swebok.researchmethod %>%
  ggplot(aes(
    x = reorder(SWEBOK, Publications, sum),
    y = Publications,
    fill = research_method
  )) +
  geom_col() +
  coord_flip() +
  xlab("SWEBOK areas") +
  scale_fill_discrete(name = "Research method")

# Saves the plot displayed last, at the size of the current graphics device.
ggsave("SWBOK_researchmethods.pdf")
## Saving 7 x 5 in image

Distribution of publications across cognitive taxonomy areas and assessment methods

# Bubble chart: for every (Taxonomy, Method) pair, the number of publications
# that have a non-empty cell in both columns.
data %>%
  filter(!is.na(Identifier)) %>%
  select(Identifier, all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  filter(!is.na(value)) %>%
  select(-value) %>%
  gather(Method, value, all_of(measures_labels)) %>%
  filter(!is.na(value)) %>%
  count(Taxonomy, Method, name = "Amount") %>%
  ggplot(aes(x = Method, y = Taxonomy)) +
  geom_point(aes(size = Amount), alpha = 0.5) +
  # "none" is the documented value for hiding the legend
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 30, hjust = 1, size = 8)
  )
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(measures_labels)` instead of `measures_labels` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

# Saves the plot displayed last, at the size of the current graphics device.
ggsave(filename = "taxonomy_methods.pdf")
## Saving 7 x 5 in image
# Heatmap: for every (SWEBOK area, high-level concept) pair, the number of
# publication/taxonomy entries flagged 1 for both.
data %>%
  select(all_of(swebok_areas_labels_no_foundation), all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  gather(Taxonomy, value2, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(SWEBOK, value, all_of(swebok_areas_labels_no_foundation)) %>%
  group_by(SWEBOK, Concept) %>%
  # pairs without any co-occurrence stay in the data with total = 0
  summarize(total = sum(value == 1 & value2 == 1)) %>%
  ungroup() %>%
  ggplot(aes(
    fct_relevel(SWEBOK, swebok_areas_labels_no_foundation),
    fct_rev(Concept),
    fill = total
  )) +
  geom_tile() +
  scale_fill_continuous(low = "#fff9f7", high = "red") +
  xlab("SWEBOK area") +
  ylab("Concept") +
  guides(fill = guide_legend(title = "")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8)) +
  # name-keyed labels: each area code maps to its own long name regardless of
  # the order in which the breaks are drawn
  scale_x_discrete(
    labels = setNames(swebok_areas_labels_long, swebok_areas_labels_no_foundation)
  )
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(swebok_areas_labels_no_foundation)` instead of `swebok_areas_labels_no_foundation` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

# Saves the plot displayed last, at the size of the current graphics device.
ggsave(filename = "taxomony_swebok_cooccurences.pdf")
## Saving 7 x 5 in image
# Heatmap: for every (high-level concept, assessment procedure) pair, the
# number of publication/taxonomy entries flagged 1 for both.
data %>%
  select(all_of(cognitive_concepts_labels), all_of(measures_labels)) %>%
  mutate_all(replace_na, 0) %>%
  gather(Taxonomy, value, all_of(cognitive_concepts_labels)) %>%
  add_high_level_concepts_to_data() %>%
  gather(Method, value2, all_of(measures_labels)) %>%
  group_by(Concept, Method) %>%
  # exactly one row (tile) per cell; cells without co-occurrence get freq = 0
  summarise(freq = sum(value == 1 & value2 == 1)) %>%
  ungroup() %>%
  ggplot(aes(
    # Axis order: the four specific qualitative procedures, the generic
    # "Qualit. measures" column, then the same for the quantitative side. The
    # generic column must sit LAST in its group to line up with its "Others"
    # label; ordering by measures_labels put it first and shifted every label.
    fct_relevel(
      Method,
      "Fieldwork", "Interview", "Task-based", "Open observation",
      "Qualit. measures",
      "Task performance", "Physiological meas.", "Subjective ratings",
      "Behavioral meas.",
      "Quantit. measures"
    ),
    fct_rev(Concept),
    fill = freq
  )) +
  geom_tile() +
  geom_vline(xintercept = 5.5, size = 0.5, color = "darkgrey") +
  xlab("Assessment procedure") +
  ylab("Concept") +
  guides(fill = guide_legend(title = "")) +
  # Name-keyed labels: only the two generic columns are renamed to the
  # catch-all "Others"; every other column keeps its own name as label.
  scale_x_discrete(
    labels = c("Qualit. measures" = "Others", "Quantit. measures" = "Others")
  ) +
  annotate(geom = "text", x = 8, y = 0.73, label = "Quantitative", size = 3, alpha = 0.4) +
  annotate(geom = "text", x = 3, y = 0.73, label = "Qualitative", size = 3, alpha = 0.4) +
  scale_fill_continuous(low = "#fff9f7", high = "darkgreen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))

# Saves the plot displayed last, at the size of the current graphics device.
ggsave("taxonomy_method_cooccurences.pdf")
## Saving 7 x 5 in image
# Stacked bar chart: publications per year, split by taxonomy area.
data %>%
  select(Year, all_of(cognitive_concepts_labels)) %>%
  gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
  mutate_all(replace_na, 0) %>%
  # non-numeric cells become NA here (the recorded run warns about coercion)
  mutate(publications = as.integer(publications)) %>%
  group_by(Year, Taxonomy) %>%
  summarise(total = sum(publications)) %>%
  ungroup() %>%
  ggplot(aes(as.factor(Year), total, fill = Taxonomy)) +
  # the totals are already computed, so draw them as-is; this also removes the
  # size legend that stat = "sum" used to add
  geom_col() +
  xlab("Year") +
  ylab("Publications") +
  scale_fill_discrete(name = "Taxonomy Areas") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
## Warning: NAs introduced by coercion
## Warning: Removed 1 rows containing non-finite values (stat_sum).

 # One row per (Year, Taxonomy) entry with a positive value.
 df.taxonomy <- data %>%
   select(Year, all_of(cognitive_concepts_labels)) %>%
   gather("Taxonomy", "publications", all_of(cognitive_concepts_labels)) %>%
   mutate_all(replace_na, 0) %>%
   # gather() yields character values here, so convert back; cells that are
   # not numeric become NA and are dropped by the filter below
   mutate(publications = as.integer(publications)) %>%
   filter(publications > 0)
## Warning: NAs introduced by coercion
# Separate table holding, for each year, every taxonomy area's share of that
# year's entries, formatted as a percentage label.
data.percentage <- df.taxonomy %>%
  count(Year, Taxonomy) %>%
  group_by(Year) %>%
  mutate(ratio = scales::percent(n / sum(n)))

# 100 % stacked bars: share of each taxonomy area per year, with the
# percentage printed in the middle of every segment.
df.taxonomy %>%
  ggplot(aes(x = as.factor(Year), fill = as.factor(Taxonomy))) +
  geom_bar(position = "fill") +
  geom_text(
    data = data.percentage,
    aes(y = n, label = ratio),
    position = position_fill(vjust = 0.5),
    colour = "white", size = 1.3
  ) +
  xlab("Year") +
  ylab("Publications %") +
  scale_fill_discrete(name = "Topic") +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    legend.key.size = unit(.2, "cm"),
    legend.key.width = unit(0.2, "cm"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  )

# ggsave() expects plain numbers for width/height plus a separate `units`.
ggsave("taxonomy_years.pdf", width = 9, height = 6.5, units = "in")
df.concepts <- df.taxonomy %>%
  add_high_level_concepts_to_data()
# Rows per year, leaving out the years listed in years_no_publications (these
# would otherwise be counted through their placeholder row). `Year %in%`
# compares against strings; %in% coerces both sides to a common type.
df.years <- data %>%
  filter(!(Year %in% years_no_publications)) %>%
  count(Year)

# Bars: share of each high-level concept per year (left axis, in %).
# Line and points: rows per year scaled by the maximum, read on the right axis.
ggplot() +
  geom_bar(
    data = df.concepts,
    aes(x = as.factor(Year), fill = Concept),
    position = "fill"
  ) +
  geom_line(
    data = df.years,
    aes(x = as.factor(Year), y = n / max(n), group = 1),
    size = 0.8
  ) +
  geom_point(
    data = df.years,
    aes(x = as.factor(Year), y = n / max(n), group = 1)
  ) +
  scale_y_continuous(
    labels = function(x) x * 100,
    name = "Publication %",
    # undo the n / max(n) scaling so the right axis shows absolute counts
    sec.axis = sec_axis(
      ~ . * max(df.years$n),
      name = "Total publications",
      breaks = scales::breaks_extended(10)
    )
  ) +
  xlab("Year") +
  theme(
    panel.background = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_text(margin = margin(-15, 0, 0, 0, "pt")),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, vjust = 2.4)
  ) +
  scale_fill_manual(values = NPG_modified)

# ggsave() expects plain numbers for width/height plus a separate `units`.
ggsave("concepts_years.pdf", width = 13, height = 6.5, units = "in")